Inside of this project we will observe data from both Craigslist postings in Davis, CA and Berkeley, CA. Since these are both cities that have a well-known college and are also local, I wanted to explore the difference in pricing in living accommodations and see just how different the average living situations would be for people living in both cities.

Since Craigslist has changed their posts to dynamically update I will be downloading all of the HTML files for 360 posts for both Davis and Berkeley. Eight of the posts for Davis were invalid (didn’t contain enough information on them to be extracted) and we will use these 352 posts to compare with the 360 from Berkeley.

These functions are for extracting the information from a full craigslist apartment post.

# Substitute NA for an empty result.
#
# x: any R object.
# Returns `x` itself when it has at least one element, otherwise a single NA.
orNA <- function(x) {
  if (length(x) > 0) x else NA
}


# Extract the fields of one Craigslist apartment posting into a data frame.
#
# Arguments:
#   url  - path or URL of the posting; only used to build the default `html`.
#   html - lines of the page; only used to build the default `doc`.
#   doc  - parsed HTML document of the posting.
#
# Returns a data.frame (intended to hold one row per posting) with the columns
# body, title, rent, Sqft, datePosted, the fields of the page's
# 'ld_posting_data' JSON (except address), one column per attribute in
# `attrs`, and a list column `address`. Fields that cannot be found in the
# page are NA.
procPost <-
function(url, html = readLines(url), doc = htmlParse(html))
{
    # First element of `x`, or NA when `x` is empty. The emptiness check has
    # to come before indexing: list()[[1]] is an error and list()[1] is
    # list(NULL), which has length 1 and so is not recognised as "missing".
    firstOrNA = function(x) if (length(x) > 0) x[[1]] else NA

    # Can get title from breadcrumb json, <meta property="og:title"> or <title> in <head>
    title = xpathSApply(doc, "/html/head/title", xmlValue)
    title = firstOrNA(gsub("- apts/housing for rent.*", "", title))

    json = xpathSApply(doc, "/html/head/script[@id = 'ld_posting_data']", xmlValue)
    meta = fromJSON(json)

    # The address is not a single value and may vary from post to post, and
    # some posts have none. Pull it out of `meta` into its own variable so the
    # remaining fields convert cleanly to a data frame; use NA when absent.
    i = match("address", names(meta))
    if (!is.na(i)) {
      address = meta[[i]]
      meta = meta[-i]
    } else {
      address = NA
    }

    attrs = c("parking", "laundry", "pets_dog", "pets_cat", "rent_period",
              "airconditioning", "application_fee_explained", "no_smoking", "wheelaccess")
    info = lapply(attrs, getAttrValue, doc)
    names(info) = attrs

    bodyNodes = getNodeSet(doc, "//section[@id = 'postingbody']")
    body = if (length(bodyNodes) > 0) xmlValue(bodyNodes[[1]]) else NA

    datePosted = firstOrNA(getNodeSet(doc, "//div[@class = 'postinginfos']//time[contains(@class, 'timeago')]/@datetime"))
    rent <- firstOrNA(xpathSApply(doc, "//span[@class='price']", xmlValue))

    # unlist()/as.character() turn the empty list returned for "no match"
    # into character(0) so the regex helpers always see a character vector.
    specs_raw <- as.character(unlist(xpathSApply(doc, "//span[@class='housing']", xmlValue)))

    # Try to extract sqft from the specs of the post. Only the first match is
    # kept so that several matches cannot add extra rows to the result.
    sqft_match <- regmatches(specs_raw, regexpr("[0-9]+\\s*ft2", specs_raw))
    sqft <- firstOrNA(as.numeric(gsub("\\s*ft2", "", sqft_match)))

    ans = cbind(data.frame(body = body,
                           title = title,
                           rent = rent,
                           Sqft = sqft,
                           datePosted = datePosted
                           ),
                as.data.frame(meta),
                as.data.frame(info))

    # Stored as a list column because the address is not a single value.
    ans$address = list(address)

    ans
}


# Look up the value of one posting attribute inside the 'attrgroup' section.
#
# what: attribute name used in the class / href of the posting markup
#       (e.g. "parking").
# doc:  parsed HTML document of the posting.
#
# Tries the <div> whose class contains `what` first; if that yields nothing,
# falls back to a link inside a plain 'attr' <div> whose href contains `what`.
# Returns the trimmed text value(s), or NA when neither query matches.
getAttrValue <- function(what, doc) {
    classQuery <- sprintf("//div[@class = 'attrgroup']//div[contains(@class, '%s')]/span[@class = 'valu']", what)
    found <- xpathSApply(doc, classQuery, xmlValue, trim = TRUE)
    if (length(found) > 0) {
        return(orNA(found))
    }

    linkQuery <- sprintf("//div[@class = 'attrgroup']//div[@class = 'attr' and not(div[contains(@class, '%s')])]/span[@class = 'valu']/a[contains(@href, '%s')]", what, what)
    orNA(xpathSApply(doc, linkQuery, xmlValue, trim = TRUE))
}


# Parse every saved posting (*.html) in `folder` and stack the per-posting
# results into one data frame, filling columns a posting lacks with NA.
loadCraigslistFolder <- function(folder) {
  htmlPaths <- list.files(folder, pattern = "\\.html$", full.names = TRUE)

  # Parse a single saved page and hand the parsed document to procPost().
  parseOne <- function(path) {
    parsed <- htmlParse(path, encoding = "UTF-8")
    procPost(path, html = readLines(path), doc = parsed)
  }

  posts <- lapply(htmlPaths, parseOne)
  do.call(rbind.fill, posts)
}

Now we will run these functions on both our Davis data (labeled “Sacramento Data” since that is how Craigslist is identifying the area surrounding Davis) and also our Berkeley data (stored in variables named “sfData” in the code).

# Raw (uncleaned) scrape for each city; the "Sf" variable holds the postings
# saved in the craigslist_berkeley folder.
dirtySacramentoData <- loadCraigslistFolder(folder = "craigslist_sacramento")
dirtySfData <- loadCraigslistFolder(folder = "craigslist_berkeley")

However we will now need to clean this data since the format and data types don’t come ready for analysis in R.

# Prepare a raw scrape for analysis.
#
# df: data frame of postings as produced by loadCraigslistFolder().
#
# Returns `df` with the X.context column dropped, X.type renamed to Type,
# the count / size / price / coordinate columns converted to numeric, missing
# petsAllowed values set to TRUE, and a logical hasAC column added.
# Columns that are absent from `df` are skipped rather than raising an error
# (assigning a zero-length value to a data frame column fails).
cleanData <- function(df) {
  # as.data.frame() on the nested JSON list creates this column; it carries
  # nothing useful for the analysis.
  df$X.context <- NULL

  # Renaming for clarity
  names(df)[names(df) == "X.type"] <- "Type"

  # Convert the columns that should be numeric, but only those present.
  numericCols <- c("numberOfBedrooms", "numberOfBathroomsTotal", "Sqft",
                   "latitude", "longitude")
  for (col in intersect(numericCols, names(df))) {
    df[[col]] <- as.numeric(df[[col]])
  }

  # Rent arrives as text such as "$1,200"; strip the symbols first.
  if ("rent" %in% names(df)) {
    df[["rent"]] <- as.numeric(gsub("[$,]", "", df[["rent"]]))
  }

  # Postings without a petsAllowed value are treated as allowing pets.
  # NOTE(review): if every non-missing value is TRUE this leaves the column
  # constant, which would explain the "not defined because of singularities"
  # row for petsAllowedTRUE in the regression output - confirm.
  if ("petsAllowed" %in% names(df)) {
    df$petsAllowed[is.na(df$petsAllowed)] <- TRUE
  } else {
    df$petsAllowed <- rep(TRUE, nrow(df))
  }

  # A posting "has AC" when its airconditioning attribute is present.
  df$hasAC <- if ("airconditioning" %in% names(df)) {
    !is.na(df[["airconditioning"]])
  } else {
    rep(FALSE, nrow(df))
  }

  df
}
# Cleaned, analysis-ready data for each city ("sf" holds the Berkeley postings).
sacramentoData <- cleanData(df = dirtySacramentoData)
sfData <- cleanData(df = dirtySfData)

In order to view plots and draw insights, we first need to change these columns to the proper class (numeric in the case of number of bedrooms/bathrooms). Then we can verify that our data is valid by checking whether these values of bedrooms/bathrooms make sense.

# Histogram of a bedroom/bathroom count with one bin per whole number.
#   data      - cleaned postings for one city
#   mapping   - aes() naming the count column to plot on x
#   fillColor - bar fill colour
#   plotTitle - plot title
#   axisLabel - x-axis label
countHistogram <- function(data, mapping, fillColor, plotTitle, axisLabel) {
  ggplot(data, mapping) +
    geom_histogram(binwidth = 1, fill = fillColor, color = "black") +
    labs(title = plotTitle, x = axisLabel, y = "Frequency") +
    theme_minimal() +
    theme(plot.title = element_text(size = 11))
}

# Davis
sacBeds <- countHistogram(sacramentoData, aes(x = numberOfBedrooms), "red",
                          "Histogram of Number of Bedrooms in Davis",
                          "Number of Bedrooms")
sacBaths <- countHistogram(sacramentoData, aes(x = numberOfBathroomsTotal), "orange",
                           "Histogram of Number of Bathrooms in Davis",
                           "Number of Bathrooms")

# Berkeley
sfBeds <- countHistogram(sfData, aes(x = numberOfBedrooms), "red",
                         "Histogram of Number of Bedrooms in Berkeley",
                         "Number of Bedrooms")
sfBaths <- countHistogram(sfData, aes(x = numberOfBathroomsTotal), "orange",
                          "Histogram of Number of Bathrooms in Berkeley",
                          "Number of Bathrooms")

# 2 x 2 grid: Davis on the top row, Berkeley on the bottom row.
grid.arrange(sacBeds, sacBaths, sfBeds, sfBaths, nrow = 2)
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

To start, the values seem to make sense and there are no values that we wouldn’t expect to be here, so we can assume that we scraped the data correctly. It appears that most of the housing in Berkeley is comparatively smaller, since the data on bedrooms is skewed right (a smaller number of bedrooms is more common). This is consistent with the number of bathrooms, since smaller living situations typically have one bathroom, and one is the mode for the number of bathrooms in Berkeley (the most common value is 1). This is also true for Davis, but you can see that the frequency of having two or more bathrooms in Davis is over double that of Berkeley.

# Scatter plot of rent against square footage for one city. Both cities use
# the same axis limits so the two plots can be compared directly.
rentVsSqftPlot <- function(data, pointColor, plotTitle) {
  ggplot(data, aes(x = Sqft, y = rent)) +
    geom_point(color = pointColor) +
    labs(title = plotTitle,
         x = "Square Footage (sqft)",
         y = "Rent ($)") +
    ylim(0, 14000) +  # $14,000 is the biggest rent value in both data sets
    xlim(0, 3700) +   # similar logic for 3,700 sqft
    theme_minimal()
}

sacRentPlot <- rentVsSqftPlot(sacramentoData, "blue", "Davis : Rent vs. Square Footage")
sfRentPlot <- rentVsSqftPlot(sfData, "darkgreen", "Berkeley : Rent vs. Square Footage")

# Interactive versions of both plots.
ggplotly(sacRentPlot)
ggplotly(sfRentPlot)

Using the above plot we can quickly see that the rent is generally going to be higher in Berkeley, but I am going to also conduct a statistical analysis to observe if this difference is significant.

I first want to observe whether there is a difference in square feet per dollar, to account for bigger places costing more. In reality, a potential customer doesn’t mind a significantly larger apartment costing more than one that is comparatively smaller. However, I want to see if the cost of living is more expensive overall in Berkeley when you properly account for the size of the apartment as well.

# Square feet obtained per dollar of monthly rent, for each posting.
sacramentoData$sqftPerDollar <- sacramentoData$Sqft / sacramentoData$rent
sfData$sqftPerDollar <- sfData$Sqft / sfData$rent

# Keep only rows with a usable ratio. is.finite() drops NA/NaN and also the
# Inf produced when rent is 0, which an is.na() filter alone would let through
# into the t-test.
sac_clean <- sacramentoData[is.finite(sacramentoData$sqftPerDollar), ]
sf_clean <- sfData[is.finite(sfData$sqftPerDollar), ]

# Labeling each with the respective city to make it clearer.
sac_clean$city <- "Davis"
sf_clean$city <- "Berkeley"
combined <- rbind(sac_clean, sf_clean)

# Side-by-side distribution of the ratio for the two cities.
ggplot(combined, aes(x = city, y = sqftPerDollar, fill = city)) +
  geom_boxplot() +
  labs(title = "Square Footage per Dollar by City",
       x = "City",
       y = "Square Feet per Dollar") +
  theme_minimal()

# Two-sample t-test of the mean ratio between the cities.
t.test(sqftPerDollar ~ city, data = combined)
## 
##  Welch Two Sample t-test
## 
## data:  sqftPerDollar by city
## t = -8.997, df = 504.81, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Berkeley and group Davis is not equal to 0
## 95 percent confidence interval:
##  -0.12756783 -0.08183941
## sample estimates:
## mean in group Berkeley    mean in group Davis 
##              0.3013338              0.4060374

Since this p-value is approximately equal to zero (p < 0.05), we reject the null hypothesis that there is no difference in square footage per dollar between Davis and Berkeley. This result provides statistical evidence that there is a difference in the average amount of space one gets per dollar of rent between the two cities.

From our sample, the mean square footage per dollar in Davis is higher than in Berkeley, which suggests that renters in Davis tend to receive more living space for the same price. Specifically, Davis listings averaged 0.406 sq ft per dollar, while Berkeley listings averaged 0.301 sq ft per dollar.

# One palette shared by both maps, so the same colour means the same rent in
# Davis and in Berkeley. The domain has to cover the rents of both data sets:
# values outside a palette's domain are treated as NA, which is what the
# "Some values were outside the color scale" warnings report when the domain
# is built from the Davis rents alone.
pal <- colorNumeric(palette = "YlOrRd",
                    domain = c(sacramentoData$rent, sfData$rent))

# Map of the postings in `data`: one circle per posting, coloured by rent,
# with a popup showing title, rent and square footage.
rentMap <- function(data) {
  leaflet(data = data) %>%
    addTiles() %>%
    addCircleMarkers(~longitude, ~latitude,
                     color = ~pal(rent),
                     radius = 5,
                     stroke = FALSE,
                     fillOpacity = 0.7,
                     popup = ~paste0("<strong>", title, "</strong><br>",
                                     "Rent: $", rent, "<br>",
                                     "Sqft: ", Sqft)) %>%
    addLegend("bottomright", pal = pal, values = ~rent,
              title = "Rent ($)",
              opacity = 1)
}

# Create the maps
rentMap(sacramentoData)
rentMap(sfData)
## Warning in pal(rent): Some values were outside the color scale and will be
## treated as NA
## Warning in pal(rent): Some values were outside the color scale and will be
## treated as NA
## Warning in pal(c(r[1], cuts, r[2])): Some values were outside the color scale
## and will be treated as NA

As you can see, the Woodland housing accommodations are generally much cheaper due to their farther distance from the university. Additionally, we can observe that there is certainly a premium that the customer pays when living closer to Berkeley, and there is simply a larger number of apartments in the orange color range (approximately $3,000 per month) in Berkeley than in Davis.

Now we will observe if housing accommodations that allow pets are more expensive. Generally it is believed that pet-friendly living is more expensive since the demand for this is high and therefore that drives up the typical cost. However, since Davis is a college town and there are fewer pets, I want to observe if there is any difference in the price of living for pet-friendly vs. non pet-friendly housing. Additionally, in this analysis a posting is counted as pet-friendly only if it lists cats or dogs as allowed; a posting with no such listing is counted as not pet-friendly.

# First, keep only rows where rent is available
# Davis postings with a known rent
petDataSac <- sacramentoData[complete.cases(sacramentoData$rent), ]

# petsAllowed is TRUE when the posting lists a cats or a dogs attribute and
# FALSE when it lists neither, stored as a factor with FALSE as the first level.
petDataSac$petsAllowed <- factor(
  !(is.na(petDataSac$pets_cat) & is.na(petDataSac$pets_dog)),
  levels = c(FALSE, TRUE)
)

# Group sizes
table(petDataSac$petsAllowed)
## 
## FALSE  TRUE 
##    22   331
# Rent distribution for each group
ggplot(data = petDataSac,
       mapping = aes(x = petsAllowed, y = rent, fill = petsAllowed)) +
  geom_boxplot() +
  theme_minimal() +
  labs(y = "Rent ($)",
       x = "Pets Allowed",
       title = "Rent Comparison: Pet-Friendly vs. Not")

# Mean rent per group
aggregate(rent ~ petsAllowed, data = petDataSac, FUN = mean)
##   petsAllowed     rent
## 1       FALSE 2159.182
## 2        TRUE 2328.408
# Two-sample t-test of mean rent between the groups
t.test(rent ~ petsAllowed, data = petDataSac)
## 
##  Welch Two Sample t-test
## 
## data:  rent by petsAllowed
## t = -1.2759, df = 25.868, p-value = 0.2133
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  -441.9299  103.4778
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            2159.182            2328.408

Even though pet-friendly units in Davis have a slightly higher average rent ($2328.41 vs. $2159.18), the difference is small and not statistically significant (p = 0.2133). Based on this sample, we cannot conclude that allowing pets affects rent.

I will now observe Berkeley and see if the conclusion is any different.

# Same steps as for Davis: keep Berkeley postings with a known rent
petDataSF <- sfData[complete.cases(sfData$rent), ]

# TRUE when the posting lists a cats or a dogs attribute, FALSE when neither
petDataSF$petsAllowed <- factor(
  !(is.na(petDataSF$pets_cat) & is.na(petDataSF$pets_dog)),
  levels = c(FALSE, TRUE)
)

# Group sizes
table(petDataSF$petsAllowed)
## 
## FALSE  TRUE 
##    96   264
# Rent distribution for each group
ggplot(data = petDataSF,
       mapping = aes(x = petsAllowed, y = rent, fill = petsAllowed)) +
  geom_boxplot() +
  theme_minimal() +
  labs(y = "Rent ($)",
       x = "Pets Allowed",
       title = "Berkeley: Rent Comparison - Pet-Friendly vs. Not")

# Mean rent per group
aggregate(rent ~ petsAllowed, data = petDataSF, FUN = mean)
##   petsAllowed     rent
## 1       FALSE 2755.521
## 2        TRUE 2599.523
# Two-sample t-test of mean rent between the groups
t.test(rent ~ petsAllowed, data = petDataSF)
## 
##  Welch Two Sample t-test
## 
## data:  rent by petsAllowed
## t = 0.73013, df = 121.93, p-value = 0.4667
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  -266.9636  578.9598
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            2755.521            2599.523

The direction of the difference is reversed in Berkeley: where pets are allowed the average rent is $2599.52, and where they aren’t allowed the average rent is $2755.52. However, with a p-value of 0.4667 this difference is also not statistically significant, so we again cannot conclude that allowing pets affects rent. A possible explanation for the direction of the difference is that we didn’t account for things such as the prestige of the living situation; some places that present themselves as luxurious would likely not allow pets, but would still be willing to charge premium rent.

Now I’m interested in testing if having air conditioning on your posting will result in higher rent prices on average.

# Davis: rent distribution for postings with and without air conditioning
ggplot(data = sacramentoData,
       mapping = aes(x = hasAC, y = rent, fill = hasAC)) +
  geom_boxplot() +
  theme_minimal() +
  labs(y = "Rent ($)",
       x = "Has Air Conditioning",
       title = "Rent by Air Conditioning Availability")

# Mean rent per group
aggregate(rent ~ hasAC,
          data = sacramentoData,
          FUN = mean,
          na.rm = TRUE)
##   hasAC     rent
## 1 FALSE 2226.621
## 2  TRUE 2347.703
# Two-sample t-test of mean rent between the groups
t.test(rent ~ hasAC, data = sacramentoData)
## 
##  Welch Two Sample t-test
## 
## data:  rent by hasAC
## t = -1.6892, df = 255.64, p-value = 0.09241
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  -262.24524   20.08061
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            2226.621            2347.703

The p-value for the t-test is greater than 0.05 (0.09241 > 0.05), so there isn’t significant evidence to conclude that AC has a real effect on the amount of rent. Units with AC do on average have a higher rent ($2347.70 vs. $2226.62); however, the difference is not statistically significant.

# Berkeley: rent distribution for postings with and without air conditioning
ggplot(data = sfData,
       mapping = aes(x = hasAC, y = rent, fill = hasAC)) +
  geom_boxplot() +
  theme_minimal() +
  labs(y = "Rent ($)",
       x = "Has Air Conditioning",
       title = "Berkeley: Rent by Air Conditioning Availability")

# Mean rent per group
aggregate(rent ~ hasAC,
          data = sfData,
          FUN = mean,
          na.rm = TRUE)
##   hasAC     rent
## 1 FALSE 2597.611
## 2  TRUE 2831.403
# Two-sample t-test of mean rent between the groups
t.test(rent ~ hasAC, data = sfData)
## 
##  Welch Two Sample t-test
## 
## data:  rent by hasAC
## t = -1.4825, df = 134.33, p-value = 0.1406
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  -545.70071   78.11659
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            2597.611            2831.403

There is once again not enough statistical evidence to conclude that having air conditioning significantly affects the price of rent in this Berkeley data set. However, in this sample the average rent was $2831.40 vs. $2597.61, which indicates there was still an average difference of about $234.

Now finally I want to answer the question “Which features are most predictive of rent?”. We will do this by constructing a model and viewing which predictors contribute the most to it. We will first take a look at the Davis data and then look at our Berkeley data to see if there is any difference in what contributes to an increase in rent.

# Linear regression of rent on size, bedroom/bathroom counts and amenities
# for the Davis postings; rows with a missing value in any term are dropped.
# NOTE(review): the summary reports petsAllowedTRUE as NA ("not defined
# because of singularities"); presumably petsAllowed is constant in this data
# after missing values were filled with TRUE - confirm.
modelSac <- lm(rent ~ Sqft + numberOfBedrooms + numberOfBathroomsTotal + petsAllowed + hasAC + parking + laundry, data = sacramentoData)
summary(modelSac)
## 
## Call:
## lm(formula = rent ~ Sqft + numberOfBedrooms + numberOfBathroomsTotal + 
##     petsAllowed + hasAC + parking + laundry, data = sacramentoData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2038.72  -204.14     8.62   181.54  1641.37 
## 
## Coefficients: (1 not defined because of singularities)
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                648.2353   216.4731   2.995 0.003014 ** 
## Sqft                         1.2970     0.1875   6.917 3.57e-11 ***
## numberOfBedrooms           152.6709    51.8242   2.946 0.003512 ** 
## numberOfBathroomsTotal    -120.4971    59.8898  -2.012 0.045253 *  
## petsAllowedTRUE                  NA         NA      NA       NA    
## hasACTRUE                  289.5197    80.3033   3.605 0.000374 ***
## parkingcarport             253.1707   134.5101   1.882 0.060930 .  
## parkingdetached garage    -237.9578   402.8213  -0.591 0.555216    
## parkingoff-street parking -179.8169   143.1972  -1.256 0.210342    
## parkingstreet parking       31.4479   270.5445   0.116 0.907553    
## laundrylaundry on site      48.0866   131.7206   0.365 0.715359    
## laundryno laundry on site   44.6844   467.4922   0.096 0.923925    
## laundryw/d hookups        -255.3812   214.4740  -1.191 0.234844    
## laundryw/d in unit          96.6656   107.6945   0.898 0.370234    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 377.7 on 260 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.7789, Adjusted R-squared:  0.7687 
## F-statistic: 76.35 on 12 and 260 DF,  p-value: < 2.2e-16

Observing the column in the table above labeled “Estimate” will show us which of these predictors have the most impact on the price of rent. For example, if we observe the “numberOfBedrooms” variable we can see that every additional bedroom adds roughly $152.67 to the rent, and that logic can be applied to all of our other variables. It is also important to note that these values describe the increase/decrease in rent when we hold all other variables constant, whereas the t-tests we conducted earlier did not. We can then see that the largest contributors to an increase in rent are larger square footage ($1.30 for every additional square foot), having AC (which increases the rent by $289.52) and having a carport, which, when compared to having an attached garage or street/off-street parking, costs roughly $253.17 more per month. Additionally, this model suggests that having an apartment with just hookups for a washer/dryer (not having the machines themselves and being required to bring your own) is going to result in a rent that is $255.38 cheaper with every other variable being held constant. Note, however, that the carport and washer/dryer hookup estimates are not statistically significant at the 0.05 level (p = 0.061 and p = 0.235), so they should be read as suggestive only. With that caveat, if someone wanted apartments with cheaper rent in Davis, they could look for places that only have washer/dryer hookups and purchase their own machines.

# Same regression as for Davis, fitted to the Berkeley postings; rows with a
# missing value in any term are dropped.
# NOTE(review): petsAllowedTRUE is again reported as NA ("not defined because
# of singularities"); presumably petsAllowed is constant in this data after
# missing values were filled with TRUE - confirm.
modelSf <- lm(rent ~ Sqft + numberOfBedrooms + numberOfBathroomsTotal + petsAllowed + hasAC + parking + laundry, data = sfData)
summary(modelSf)
## 
## Call:
## lm(formula = rent ~ Sqft + numberOfBedrooms + numberOfBathroomsTotal + 
##     petsAllowed + hasAC + parking + laundry, data = sfData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2164.67  -376.03   -27.97   333.52  2872.77 
## 
## Coefficients: (1 not defined because of singularities)
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                118.8600   179.9895   0.660 0.509931    
## Sqft                         2.9099     0.2851  10.207  < 2e-16 ***
## numberOfBedrooms           365.6463   119.1951   3.068 0.002520 ** 
## numberOfBathroomsTotal    -445.8891   128.5240  -3.469 0.000665 ***
## petsAllowedTRUE                  NA         NA      NA       NA    
## hasACTRUE                  202.4131   130.2453   1.554 0.122067    
## parkingcarport             269.6585   148.9439   1.810 0.072031 .  
## parkingdetached garage    -266.2639   180.5343  -1.475 0.142143    
## parkingno parking         -105.2348   411.6542  -0.256 0.798546    
## parkingoff-street parking  -91.9404   148.4187  -0.619 0.536459    
## parkingstreet parking     -178.2395   176.6905  -1.009 0.314555    
## laundrylaundry on site     112.8019   176.0858   0.641 0.522661    
## laundryno laundry on site  189.3900   324.0855   0.584 0.559756    
## laundryw/d hookups        -420.0608   300.4113  -1.398 0.163893    
## laundryw/d in unit         305.3260   130.5047   2.340 0.020496 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 673.5 on 166 degrees of freedom
##   (180 observations deleted due to missingness)
## Multiple R-squared:  0.7818, Adjusted R-squared:  0.7647 
## F-statistic: 45.74 on 13 and 166 DF,  p-value: < 2.2e-16